/******************************************************************************
This file cleans the enrollment file for the analysis based on the demographic
files.
--------------------------------------------------------------------------------
Input:			Raw data sets containing an overview of students' enrollment
					> {yyyy}-{yy}_June-Biog_PK-12_Scrambled.dta
				Stacked all-grade file (written by this file when append == 1, then re-read)
					> ${cleandata}/demo_all_years_all_grade.dta
--------------------------------------------------------------------------------
Output:			Cleaned demographic file
					> demo_all_years_ms.dta
				Enrollment data
					> enroll_data_MS.dta
				Demographics by grade
					> demo_info_{g}th_grade.dta
--------------------------------------------------------------------------------*/
* Reset Stata and disable output paging
clear all
set more off

* Switch: 1 rebuilds the stacked all-grade file from the raw yearly files;
* any other value skips the rebuild and relies on the previously saved copy
local append = 1

if `append' == 1 {

	* Start from an empty accumulator so that no placeholder row is carried
	* through the appends
	clear
	tempfile demos
	save `demos', emptyok

	* Loop over years (one raw file per school year)
	forval year = 2005/2018 {

		* Two-digit suffix of the following calendar year, zero-padded,
		* as used in the raw file name (2005 gives "06")
		local digits_plus = string(`year' - 2000 + 1, "%02.0f")
		local next_yr = `year' + 1

		* Load raw data
		use "${cleandata}/demos/JuneBiog/`year'-`digits_plus'_June-Biog_PK-12_Scrambled.dta", clear

		* From the 2017 file onward, convert these fields to string.
		* capture suppresses any error so the remaining fields are still processed.
		* NOTE(review): presumably the earlier files already store them as
		* strings and this keeps the types compatible for the append - confirm
		if `year' >= 2017 {
			foreach v in disc_code admit_date disc_date ell swd poverty {
				cap tostring `v', replace
			}
		}
		rename grade_level grade

		* Gen female info
		gen byte female = sex == "F"

		* Generate racial info (mutually exclusive 0/1 indicators)
		gen byte asian 			= ethnicity == "Asian"
		gen byte black 			= ethnicity == "Black"
		gen byte hispanic 		= ethnicity == "Hispanic"
		gen byte nat_american	= ethnicity == "Native American"
		gen byte white	 		= ethnicity == "White"
		* other flags records whose ethnicity is the literal string "."
		gen byte other 			= ethnicity == "."

		* Generate free and reduced price lunch info
		* (real() yields missing when poverty is not a numeric string)
		gen byte fr_lunch = real(poverty)

		* Gen spring year info
		gen year = `next_yr'

		* Stack this year on top of the years processed so far
		append using `demos'
		save `demos', replace
	}

	* Harmonize variable names
	ren (student_id dbn) (stu sch_long)

	* Create school code variable: characters 3-6 of the long school identifier
	gen sch = substr(sch_long, 3, 4)

	* Drop if ID or grade is missing
	drop if stu == "" | grade == ""

	save "${cleandata}/demo_all_years_all_grade.dta", replace
}

	* Elementary and middle school extract: reload the stacked all-grade file,
	* discard every record outside grades 01-08, and store the result
	use "${cleandata}/demo_all_years_all_grade.dta", clear
	drop if !inlist(grade, "01", "02", "03", "04", ///
		"05", "06", "07", "08")
	save "${cleandata}/demo_all_years_ms.dta", replace

	* Save grade-specific demographics file
	* (works on the data currently in memory)

	* Limit to first attempt in grade: keep only the earliest year in which
	* each student appears in a given grade
	bys stu grade: egen min_year = min(year)
	drop if year != min_year
	drop min_year
	* NOTE(review): a student with several records in that earliest year
	* remains duplicated within grade - confirm whether that is intended

	* Generate indicator for borough: third character of the school identifier
	gen enr_borough = substr(sch_long,3,1)

	* Loop over grades that potentially serve as baseline grades
	foreach gr in 05 {
		preserve
			* Limit sample to grade
			keep if grade == "`gr'"

			* Borough of enrollment in this grade (reuses the borough letter above)
			gen enr_`gr'_borough = enr_borough

			keep stu sch_long year asian black hispanic nat_american white other female swd ell fr_lunch enr_`gr'_borough

			* Rename variables: the bl_ prefix marks them as baseline values
			foreach var in asian black hispanic nat_american white other female swd ell fr_lunch {
				ren `var' bl_`var'
			}

			* Generate 0/1 indicators for borough names, stored as byte like
			* the other indicators in this file
			gen byte enr_`gr'_queens = 		enr_`gr'_borough == "Q"
			gen byte enr_`gr'_manhattan = 	enr_`gr'_borough == "M"
			gen byte enr_`gr'_brooklyn = 	enr_`gr'_borough == "K"
			gen byte enr_`gr'_staten = 		enr_`gr'_borough == "R"
			gen byte enr_`gr'_bronx = 		enr_`gr'_borough == "X"

			* Save (directory separator added to match the other cleandata paths)
			sa "${cleandata}/demo_info_`gr'th_grade.dta", replace

		restore
	}
